NYC Weather Data: Temperature analysis

nyc_weather <- tibble(read.csv("data/NYC_weather_1869_2022.csv")) %>%
  dplyr::select(TMAX, TMIN, DATE, PRCP, SNOW)
# view structure
str(nyc_weather)
## tibble [56,093 × 5] (S3: tbl_df/tbl/data.frame)
##  $ TMAX: int [1:56093] 25 26 36 43 44 35 36 26 38 46 ...
##  $ TMIN: int [1:56093] 17 4 19 28 19 11 20 20 26 32 ...
##  $ DATE: chr [1:56093] "1869-02-28" "1869-03-01" "1869-03-02" "1869-03-03" ...
##  $ PRCP: num [1:56093] 0 0 0.02 0 0.1 0 0.03 0 0 0 ...
##  $ SNOW: num [1:56093] 0 0 0 0 0 0 0.8 0 0 0 ...
# get summary
xkablesummary(nyc_weather)
Table: Statistics summary.
TMAX TMIN DATE PRCP SNOW
Min Min. : 2.0 Min. :-15.0 Length:56093 Min. :0.00 Min. : 0.0
Q1 1st Qu.: 46.0 1st Qu.: 34.0 Class :character 1st Qu.:0.00 1st Qu.: 0.0
Median Median : 63.0 Median : 47.0 Mode :character Median :0.00 Median : 0.0
Mean Mean : 61.5 Mean : 46.8 NA Mean :0.12 Mean : 0.1
Q3 3rd Qu.: 78.0 3rd Qu.: 62.0 NA 3rd Qu.:0.05 3rd Qu.: 0.0
Max Max. :106.0 Max. : 87.0 NA Max. :8.28 Max. :27.3
NA NA’s :7 NA’s :7 NA NA NA’s :163

Convert DATE column from char to Date type

nyc_weather <- nyc_weather %>%
  mutate(DATE = as.Date(DATE, format="%Y-%m-%d"))

# review dataset structure
xkablesummary(nyc_weather)
Table: Statistics summary.
TMAX TMIN DATE PRCP SNOW
Min Min. : 2.0 Min. :-15.0 Min. :1869-02-28 Min. :0.00 Min. : 0.0
Q1 1st Qu.: 46.0 1st Qu.: 34.0 1st Qu.:1907-07-23 1st Qu.:0.00 1st Qu.: 0.0
Median Median : 63.0 Median : 47.0 Median :1945-12-13 Median :0.00 Median : 0.0
Mean Mean : 61.5 Mean : 46.8 Mean :1945-12-13 Mean :0.12 Mean : 0.1
Q3 3rd Qu.: 78.0 3rd Qu.: 62.0 3rd Qu.:1984-05-05 3rd Qu.:0.05 3rd Qu.: 0.0
Max Max. :106.0 Max. : 87.0 Max. :2022-09-26 Max. :8.28 Max. :27.3
NA NA’s :7 NA’s :7 NA NA NA’s :163

Add a MONTH column to the dataset. The month is extracted from the DATE.
Subset data for DATE > 1900

nyc_weather <- nyc_weather %>%
  filter(lubridate::year(DATE) >= 1900) %>%
  mutate(MONTH = lubridate::month(DATE, label=T))

str(nyc_weather)
## tibble [44,829 × 6] (S3: tbl_df/tbl/data.frame)
##  $ TMAX : int [1:44829] 20 23 26 32 39 40 43 40 33 41 ...
##  $ TMIN : int [1:44829] 14 13 19 19 29 32 31 20 15 30 ...
##  $ DATE : Date[1:44829], format: "1900-01-01" "1900-01-02" ...
##  $ PRCP : num [1:44829] 0.03 0 0 0 0 0 0 0.01 0 0 ...
##  $ SNOW : num [1:44829] 0.5 0 0 0 0 0 0 0 0 0 ...
##  $ MONTH: Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: 1 1 1 1 1 1 1 1 1 1 ...
# review dataset structure
xkablesummary(nyc_weather)
Table: Statistics summary.
TMAX TMIN DATE PRCP SNOW MONTH
Min Min. : 2.0 Min. :-15.0 Min. :1900-01-01 Min. :0.00 Min. : 0.0 Jan : 3813
Q1 1st Qu.: 47.0 1st Qu.: 34.0 1st Qu.:1930-09-08 1st Qu.:0.00 1st Qu.: 0.0 Mar : 3813
Median Median : 63.0 Median : 48.0 Median :1961-05-15 Median :0.00 Median : 0.0 May : 3813
Mean Mean : 62.1 Mean : 47.2 Mean :1961-05-15 Mean :0.13 Mean : 0.1 Jul : 3813
Q3 3rd Qu.: 78.0 3rd Qu.: 62.0 3rd Qu.:1992-01-20 3rd Qu.:0.05 3rd Qu.: 0.0 Aug : 3813
Max Max. :106.0 Max. : 87.0 Max. :2022-09-26 Max. :7.57 Max. :27.3 Oct : 3782
NA NA NA NA NA NA’s :96 (Other):21982

Temperature Plots

mean_tmax <- mean(nyc_weather$TMAX)
mean_tmin <- mean(nyc_weather$TMIN)

ggplot(nyc_weather) +
  geom_histogram(aes(x=TMAX, fill="Max Temp."), na.rm=TRUE, alpha=0.5, color="black", bins=100, binwidth=2) +
  geom_vline(xintercept=mean_tmax, color="red", size=1, linetype=5, show.legend=FALSE) +
  annotate("text", x=mean_tmax + 9, y=2000, label=paste(round(mean_tmax, 2), "°F" ), angle=0, size=4, color="darkred") +
  geom_histogram(aes(x=TMIN, fill="Min Temp."), na.rm=TRUE, alpha=0.5, color="black", bins=100, binwidth=2) +
  geom_vline(xintercept=mean_tmin, color="blue", size=1, linetype=5, show.legend=FALSE) +
  annotate("text", x=mean_tmin - 9, y=2000, label=paste(round(mean_tmin, 2), "°F" ), angle=0, size=4, color="darkblue") +
  labs(title="Distribution of Minimum and Maximum Daily Temperatures", x="Temperature (°F)", y="Count") +
  scale_y_continuous(position = "right") +
  scale_fill_manual(name="Column", values=c("Max Temp."="red","Min Temp."="blue"))

nyc_weather %>%
  ggplot(aes(y=TMAX)) +
  geom_boxplot(na.rm=TRUE, alpha=0.7, color="black", fill="red") +
  labs(title="TMAX Boxplot", y="Temperature (°F)")

nyc_weather %>%
  ggplot(aes(y=TMIN)) +
  geom_boxplot(na.rm=TRUE, alpha=0.7, color="black", fill="steelblue") +
  labs(title="TMIN Boxplot", x="", y="Temperature (°F)")

qqnorm(nyc_weather$TMAX,
       main="QQ Plot TMAX",
       xlab="Theoretical Quantile",
       ylab="Temperature  (°F)",
       col="red")
qqline(nyc_weather$TMAX, lwd=1, lty=2)

qqnorm(nyc_weather$TMIN,
       main="QQ Plot TMIN",
       xlab="Theoretical Quantile",
       ylab="Temperature  (°F)",
       col="blue")
qqline(nyc_weather$TMIN, lwd=1, lty=2)

Average daily temperature max and min by month, all time

# group data by month
# and calculate average
nyc_weather_by_month <- nyc_weather %>%
  group_by(MONTH) %>%
  summarize("Avg Max Temp." = mean(TMAX, na.rm=T),
            "Avg Min Temp." = mean(TMIN, na.rm=T),
            avg_prcp = mean(PRCP, na.rm=T),
            avg_snow = mean(SNOW, na.rm=T))

# side-by-side bar plot
nyc_weather_by_month %>%
  dplyr::select(MONTH, "Avg Max Temp.", "Avg Min Temp.") %>%
  gather(key="Value", value="Temp.", "Avg Max Temp.", "Avg Min Temp.") %>%
  ggplot(aes(x=MONTH, y=Temp., fill=Value)) +
  geom_col(na.rm=TRUE, alpha=0.5, color="black", position="dodge") +
  labs(title="Average Daily Temperature By Month", x="Month", y="Average Temperature (°F)") +
  scale_fill_manual(values=c("blue", "red"))

Average percipitation and snow fall by month, all time

nyc_weather_by_month %>%
  dplyr::select(MONTH, avg_prcp, avg_snow) %>%
  gather(key = "data_point", value="value", avg_prcp, avg_snow) %>%
  ggplot(aes(x=MONTH, y=value, fill=data_point)) +
    geom_col(na.rm=TRUE, alpha=0.7, color="black", position="dodge") +
    labs(x="Month", y="Amount (inches)")

Categorical Analysis

For this analysis, I sought to subset the data into 30-50 year periods as such:
* 1900-1940: Industrial Revolution
* 1940-1980: Cold War Era
* 1980-present: Modern Era

1900-1940

Temperature and Precipitation Boxplots

# filter weather data for years 1900-1940
industrial_era <- nyc_weather %>%
  dplyr::select(-MONTH) %>%
  filter(lubridate::year(DATE) >= 1900, lubridate::year(DATE) < 1940)

str(industrial_era)
## tibble [14,609 × 5] (S3: tbl_df/tbl/data.frame)
##  $ TMAX: int [1:14609] 20 23 26 32 39 40 43 40 33 41 ...
##  $ TMIN: int [1:14609] 14 13 19 19 29 32 31 20 15 30 ...
##  $ DATE: Date[1:14609], format: "1900-01-01" "1900-01-02" ...
##  $ PRCP: num [1:14609] 0.03 0 0 0 0 0 0 0.01 0 0 ...
##  $ SNOW: num [1:14609] 0.5 0 0 0 0 0 0 0 0 0 ...
industrial_era %>%
  ggplot() +
  geom_histogram(aes(x=TMAX, fill="TMAX"), na.rm=TRUE, alpha=0.8, fill="steelblue", bins=100, binwidth=2) +
  labs(title="Distribution of TMAX for Years 1900-1940", x="Temperature (°F)", y="Count")

industrial_era %>%
  dplyr::select(TMAX, TMIN, DATE) %>%
  gather(key="data_point", value="value", TMAX, TMIN) %>%
  ggplot(aes(y=value, fill=data_point)) +
  geom_boxplot(na.rm=TRUE, alpha=0.7, color="black") +
  labs(title="Distribution of TMAX and TMIN for Years 1900-1940", x="Data Point", y="Temperature (°F)")

industrial_era %>%
  dplyr::select(PRCP, SNOW, DATE) %>%
  gather(key="data_point", value="value", PRCP, SNOW) %>%
  filter(value > 0.0) %>%
  ggplot(aes(y=value, fill=data_point)) +
  geom_boxplot(na.rm=TRUE, alpha=0.7, color="black") +
  labs(title="Distribution of PRCP and SNOW for Years 1900-1940", x="Data Point", y="Amount (inches)")

1940-1980

Temperature and Precipitation Boxplots

coldwar_era <- nyc_weather %>%
  dplyr::select(-MONTH) %>%
  filter(lubridate::year(DATE) >= 1940, lubridate::year(DATE) < 1980)

str(coldwar_era)
## tibble [14,610 × 5] (S3: tbl_df/tbl/data.frame)
##  $ TMAX: int [1:14610] 24 28 33 32 29 30 26 24 29 34 ...
##  $ TMIN: int [1:14610] 15 17 17 20 16 14 12 19 20 18 ...
##  $ DATE: Date[1:14610], format: "1940-01-01" "1940-01-02" ...
##  $ PRCP: num [1:14610] 0 0 0 0 0.03 0 0 0.08 0 0 ...
##  $ SNOW: num [1:14610] 0 0 0 0 0.3 0 0 1 0 0 ...
coldwar_era %>%
  ggplot() +
  geom_histogram(aes(x=TMAX, fill="TMAX"), na.rm=TRUE, alpha=0.8, fill="steelblue", bins=100, binwidth=2) +
  labs(title="Distribution of TMAX for Years 1940-1980", x="Temperature (°F)", y="Count")

coldwar_era %>%
  dplyr::select(TMAX, TMIN, DATE) %>%
  gather(key="data_point", value="value", TMAX, TMIN) %>%
  ggplot(aes(y=value, fill=data_point)) +
  geom_boxplot(na.rm=TRUE, alpha=0.7, color="black") +
  labs(title="Distribution of TMAX and TMIN for Years 1940-1980", x="Data Point", y="Temperature (°F)")

coldwar_era %>%
  dplyr::select(PRCP, SNOW, DATE) %>%
  gather(key="data_point", value="value", PRCP, SNOW) %>%
  filter(value > 0.0) %>%
  ggplot(aes(y=value, fill=data_point)) +
  geom_boxplot(na.rm=TRUE, alpha=0.7, color="black") +
  labs(title="Distribution of PRCP and SNOW for Years 1940-1980", x="Data Point", y="Amount (inches)")

1980 - present

Temperature and Precipitation Boxplots

modern_era <- nyc_weather %>%
  dplyr::select(-MONTH) %>%
  filter(lubridate::year(DATE) >= 1980)

str(modern_era)
## tibble [15,610 × 5] (S3: tbl_df/tbl/data.frame)
##  $ TMAX: int [1:15610] 45 42 38 30 32 32 40 38 32 34 ...
##  $ TMIN: int [1:15610] 34 34 28 21 26 19 27 31 29 23 ...
##  $ DATE: Date[1:15610], format: "1980-01-01" "1980-01-02" ...
##  $ PRCP: num [1:15610] 0 0 0 0 0.07 0 0 0 0 0 ...
##  $ SNOW: num [1:15610] 0 0 0 0 2 0 0 0 0 0 ...
modern_era %>%
  ggplot() +
  geom_histogram(aes(x=TMAX, fill="TMAX"), na.rm=TRUE, alpha=0.8, fill="steelblue", bins=100, binwidth=2) +
  labs(title="Distribution of TMAX for Years 1980-Present", x="Temperature (°F)", y="Count")

modern_era %>%
  dplyr::select(TMAX, TMIN, DATE) %>%
  gather(key="data_point", value="value", TMAX, TMIN) %>%
  ggplot(aes(y=value, fill=data_point)) +
  geom_boxplot(na.rm=TRUE, alpha=0.7, color="black") +
  labs(title="Distribution of TMAX and TMIN for Years 1980-Present", x="Data Point", y="Temperature (°F)")

modern_era %>%
  dplyr::select(PRCP, SNOW, DATE) %>%
  gather(key="data_point", value="value", PRCP, SNOW) %>%
  filter(value > 0.0) %>%
  ggplot(aes(y=value, fill=data_point)) +
  geom_boxplot(na.rm=TRUE, alpha=0.7, color="black") +
  labs(title="Distribution of PRCP and SNOW for Years 1980-Present", x="Data Point", y="Amount (inches)")

ANOVA test

\(H_0:\) The average TMAX across the three eras is equal.
\(H_1:\) The average TMAX across the three eras is different.
\(\alpha\): 0.05

# modify data set by adding ERA column
industrial_era <- industrial_era %>%
  mutate(Era = 1)
coldwar_era <- coldwar_era %>%
  mutate(Era = 2)
modern_era <- modern_era %>%
  mutate(Era = 3)
# combine data frames
all_eras <- rbind(industrial_era, coldwar_era, modern_era)
# convert ERA column to factor/categorical
all_eras$Era <- factor(all_eras$Era, labels = c("Indst.", "Cold War", "Modern"))

# box plot
means <- aggregate(TMAX ~ Era, all_eras, mean)

all_eras %>%
  ggplot(aes(x=Era, y=TMAX, fill=Era)) +
  geom_boxplot(na.rm=TRUE, alpha=0.7, color="black") +
  stat_summary_bin(fun="mean", geom="point", shape=20, size=5, show.legend=FALSE) +
  geom_text(data = means, aes(label=round(TMAX, 2) , y=TMAX - 5)) +
  labs(title="Distribution of Daily Maximum Temperature By Era", x="Era", y="Temperature (°F)") +
  scale_fill_brewer(palette="Dark2")

# run ANOVA
anova_res <- aov(TMAX ~ Era, data = all_eras)
summary(anova_res)
##                Df   Sum Sq Mean Sq F value Pr(>F)    
## Era             2    35301   17651      51 <2e-16 ***
## Residuals   44826 15516075     346                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
xkabledply(anova_res)
Model: TMAX ~ Era
Df Sum Sq Mean Sq F value Pr(>F)
Era 2 35301 17651 51 0
Residuals 44826 15516075 346 NA NA

With a p-value of 7.519^{-23}, we strongly reject the \(H_0\) that the average daily maximum temperature between the 3 eras is the same.

Tukey HSD

This tests will analyze the difference between means of the 3 eras:

tukeyAoV <- TukeyHSD(anova_res, conf.level=0.95)
tukeyAoV
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = TMAX ~ Era, data = all_eras)
## 
## $Era
##                  diff   lwr  upr p adj
## Cold War-Indst. 1.339 0.829 1.85     0
## Modern-Indst.   2.147 1.645 2.65     0
## Modern-Cold War 0.808 0.306 1.31     0
plot(tukeyAoV, col="red", cex.axis=0.75)

The Tukey tests shows that the greatest difference in average daily maximum temperature is between the Industrial and Modern eras.

Outlier Check on Precipitation

# extract PRCP outlier rows into df variable
# remove zeros
# group variable by decade, getting sum for each decade
prcp_no_zeros <- nyc_weather %>%
  filter(PRCP > 0.0 & !is.na(PRCP))
prcp_boxplot <- boxplot(prcp_no_zeros$PRCP, ylab = "Temperature (°F)")

prcp_outliers <- boxplot.stats(prcp_no_zeros$PRCP)$out
prcp_outliers.index <- which(prcp_no_zeros$PRCP %in% c(prcp_outliers))
prcp_outliers_df <- prcp_no_zeros[prcp_outliers.index, ]

prcp_outliers_df.yearly <- prcp_outliers_df %>%
  mutate(YEAR = lubridate::year(DATE)) %>%
  group_by(YEAR) %>%
  summarize(prcp_total = sum(PRCP, na.rm=T), count = n())

# show top 5 years
top_5_years.prcp_total <- prcp_outliers_df.yearly %>%
  slice_max(order_by = prcp_total, n = 5)

prcp_outliers_df.yearly %>%
  ggplot(aes(x=YEAR, y=prcp_total)) +
    geom_line(na.rm=TRUE, alpha=0.7, color="darkgreen") +
    geom_point(na.rm = TRUE, fill="#69b3a2", shape = 21) +
    geom_text_repel(aes(label=YEAR), data=top_5_years.prcp_total) +
    labs(x="Year", y="Total Yearly Amount (inches)") +
    scale_fill_brewer(palette="Spectral")

top_5_years.count <- prcp_outliers_df.yearly %>%
  slice_max(order_by = count, n = 5)

prcp_outliers_df.yearly %>%
  ggplot(aes(x=YEAR, y=count)) +
    geom_line(na.rm=TRUE, alpha=0.7, color="darkgreen") +
    geom_point(na.rm = TRUE, fill="#69b3a2", shape = 21) +
    geom_text_repel(aes(label=YEAR), data=top_5_years.count) +
    labs(title="Precipitation Outlier Days By Year", x="Year", y="Days with Outliers")

# group into decades and create bar plot
prcp_outliers_df.decades <- prcp_outliers_df %>%
  mutate(decade = paste(floor(lubridate::year(DATE) / 10) * 10, "s", sep="")) %>%
  group_by(decade) %>%
  summarise(prcp_total = sum(PRCP, na.rm=T), count = n())

prcp_outliers_df.decades %>%
  ggplot(aes(group=1)) +
  geom_col(aes(x=decade, y=prcp_total), na.rm=TRUE, alpha=0.8, color="black", fill="darkgreen", position="identity") +
  geom_line(aes(x=decade, y=count), stat="identity", color="#69b3a2", size=1) +
  labs(title="Precipitation Outlier Days And Total Precipitation By Decade", x="Decade", y="Total Amount (inches)") +
  scale_y_continuous(sec.axis=sec_axis(~., name="Days with Outliers"))

prcp_outliers_df.decades %>%
  ggplot(aes(x=decade, y=count)) +
    geom_col(na.rm=TRUE, alpha=0.7, color="black", fill="#56B4E9", position="identity") +
    labs(x="Decade", y="Days with Outliers")

Christmas Snow

# sum snowfall from 24-25th Dec
nyc_weather.christmas_snow <- nyc_weather %>%
  filter(lubridate::month(DATE) == 12, (lubridate::day(DATE) == 24 | lubridate::day(DATE) == 25)) %>%
  mutate(YEAR = lubridate::year(DATE)) %>%
  group_by(YEAR) %>%
  summarise(snowfall = sum(SNOW, na.rm = T))

# how many years has there been snow?
with_snow <- nyc_weather.christmas_snow %>%
  filter(snowfall > 0.0)

# highest snowfall year?
largest_snowfall <- nyc_weather.christmas_snow[which.max(nyc_weather.christmas_snow$snowfall), ]

# yearly plot for years it snowed

with_snow %>%
  ggplot(aes(x=YEAR, y=snowfall)) +
    geom_line(na.rm=TRUE, alpha=0.7, color="steelblue", size=1.25) +
    geom_point(na.rm = TRUE, fill="#00edfa", shape = 21) +
    geom_label_repel(aes(label=YEAR), data=with_snow) +
    labs(title="Years With Snowfall on Christmas Eve/Christmas Day", x="Year", y="Snowfall (inches)")

It has snowed on 24 out of 152 Christmases.

The largest snowfall recorded on Christmas/Christmas Eve was in 1912 when it snowed 11.4 inches!!

Importing Weather Data

weather_data <- data.frame(read.csv("data/NYC_weather_1869_2022.csv"))

Separating Date into Day/Mont/Year columns

weather_data$DATE <- as.Date(weather_data$DATE)

weather_data$YEAR <- as.numeric(format(weather_data$DATE, "%Y"))
weather_data$MONTH<- as.factor(format(weather_data$DATE, "%m"))
weather_data$DAY <- format(weather_data$DATE, "%d")

I broke out the date column into separate Day, Month, and Year columns. I did this to make the data easier to manipulate on a month or year level.

Subsetting Data and Finding Monthly Averages for Temp, Precip, and Snowfall

weather_100yr <- subset(weather_data, weather_data$YEAR > 1899 & weather_data$YEAR < 2022)

month_tmax_avg <- aggregate(TMAX ~ MONTH + YEAR, weather_data, mean)
month_tmin_avg <- aggregate(TMIN ~ MONTH + YEAR, weather_data, mean)

month_precip_avg <- aggregate(PRCP ~ MONTH + YEAR, weather_data, mean)
colnames(month_precip_avg)[3] <- "PRCP_AVG"

month_precip_total <- aggregate(PRCP ~ MONTH + YEAR, weather_data, sum)
colnames(month_precip_total)[3] <- "PRCP_TOT"

month_snow_avg <- aggregate(SNOW ~ MONTH + YEAR, weather_data, mean)
colnames(month_snow_avg)[3] <- "SNOW_AVG"

month_snow_total <- aggregate(SNOW ~ MONTH + YEAR, weather_data, sum)
colnames(month_snow_total)[3] <- "SNOT_TOT"


monthly_weather <- data.frame(month_tmax_avg, 
                                             month_tmin_avg$TMIN,
                                             month_precip_avg$PRCP_AVG,
                                             month_precip_total$PRCP_TOT,
                                             month_snow_avg$SNOW_AVG,
                                             month_snow_total$SNOT_TOT)
colnames(monthly_weather)[4:8] <- c("TMIN", "PRCP_AVG", "PRCP_TOT", "SNOW_AVG", "SNOW_TOT")

monthly_weather_100yr <- subset(monthly_weather, monthly_weather$YEAR > 1899 & monthly_weather$YEAR < 2022)

Found the monthly average for TMAX, TMIN, PRCP, and SNOW for all months from 1900-2022. Additionally found the total monthly PRCP and SNOW for all months from 1900 - 2022.

Doing the same for Yearly Data

year_tmax_avg <- aggregate(TMAX ~ YEAR, weather_data, mean)
year_tmin_avg <- aggregate(TMIN ~ YEAR, weather_data, mean)

year_precip_avg <- aggregate(PRCP ~ YEAR, weather_data, mean)
colnames(year_precip_avg)[2] <- "PRCP_AVG"

year_precip_total <- aggregate(PRCP ~ YEAR, weather_data, sum)
colnames(year_precip_total)[2] <- "PRCP_TOT"

year_snow_avg <- aggregate(SNOW ~ YEAR, weather_data, mean)
colnames(year_snow_avg)[2] <- "SNOW_AVG"

year_snow_total <- aggregate(SNOW ~ YEAR, weather_data, sum)
colnames(year_snow_total)[2] <- "SNOT_TOT"


yearly_weather <- data.frame(year_tmax_avg, 
                             year_tmin_avg$TMIN, 
                             year_precip_avg$PRCP_AVG, 
                             year_precip_total$PRCP_TOT, 
                             year_snow_avg$SNOW_AVG, 
                             year_snow_total$SNOT_TOT)
colnames(yearly_weather)[3:7] <- c("TMIN", "PRCP_AVG", "PRCP_TOT", "SNOW_AVG", "SNOW_TOT")


yearly_weather_100yr <- subset(yearly_weather, yearly_weather$YEAR > 1899 & yearly_weather$YEAR < 2022)

melt_year_data <- melt(yearly_weather_100yr, id.vars = "YEAR")

Found yearly averages for TMAX, TMIN, PRCP, and SNOW. Additionally found the yearly totals for PRCP and SNOW.

Statistical Analysis

Question 1:

Are there statistically measurable changes in weather patterns (e.g., temperature and precipitation levels) over time in New York City over this window?

  • What statistical test?
  • Does this require a linear model?

Using a Linear Regression to estimate the linear trend and statistical significance over time.

TMAX

#install.packages("gtsummary")

lm_tmax_yr <- lm(TMAX ~ YEAR, data = yearly_weather_100yr)
#lm_tmax_mtyr <- lm(TMAX ~ YEAR + MONTH, data = monthly_weather_100yr)
#lm_tmax_day <- lm(TMAX ~ YEAR, data = weather_100yr)

summary(lm_tmax_yr)
## 
## Call:
## lm(formula = TMAX ~ YEAR, data = yearly_weather_100yr)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -2.925 -0.827  0.076  0.746  3.190 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  12.5663     6.2657    2.01    0.047 *  
## YEAR          0.0253     0.0032    7.90  1.5e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.24 on 120 degrees of freedom
## Multiple R-squared:  0.342,  Adjusted R-squared:  0.337 
## F-statistic: 62.4 on 1 and 120 DF,  p-value: 1.49e-12
#summary(lm_tmax_mtyr)
#summary(lm_tmax_day)


coef_tmax_yr <- c(coefficients(lm_tmax_yr), summary(lm_tmax_yr)$adj.r.squared)
coef_tmax_yr
## (Intercept)        YEAR             
##     12.5664      0.0252      0.3367

The table above shows the slope and intercept of the TMAX linear model based on two different approaches.coef_tmax_day uses the raw daily TMAX for each year in the dataset while coef_tmax_yr uses the yearly tmax averages.

yr_tmax_scatter_line <- yearly_tmax_scatter+
  stat_smooth(method = "lm",
              formula = y ~ x,
              geom = "smooth",
              color = "black",
              show.legend = F)

yr_tmax_scatter_line

The regression line added to the yearly TMAX scatterplot. The slope of the line is 0.025. This implies the average daily maximum temperature is increasing by 0.025 degrees F per year, or by about 0.252 degrees F per decade.

TMIN

lm_tmin_yr <- lm(TMIN ~ YEAR, data = yearly_weather_100yr)
summary(lm_tmin_yr)
## 
## Call:
## lm(formula = TMIN ~ YEAR, data = yearly_weather_100yr)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7039 -0.9296  0.0013  0.9041  3.1525 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -3.1490     6.0830   -0.52     0.61    
## YEAR          0.0256     0.0031    8.27  2.1e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.21 on 120 degrees of freedom
## Multiple R-squared:  0.363,  Adjusted R-squared:  0.358 
## F-statistic: 68.3 on 1 and 120 DF,  p-value: 2.13e-13
#lm_tmin_day <- lm(TMIN ~ YEAR, data = weather_100yr)
#summary(lm_tmin_day)


#coef_tmin_yr <- coefficients(lm_tmin_yr)
#coef_tmin_day <- coefficients(lm_tmin_day)

The table above shows the slope and intercept of the TMIN linear model based on two different approaches.coef_tmin_day uses the raw daily TMIN for each year in the dataset while coef_tmin_yr uses the yearly TMIN averages.

yr_tmin_scatter_line <- yearly_tmin_scatter +
  stat_smooth(method = "lm",
              formula = y ~ x,
              geom = "smooth",
              color = "black",
              show.legend = F)

yr_tmin_scatter_line

PRCP

lm_prcp_yr <- lm(PRCP_TOT ~ YEAR, data = yearly_weather_100yr)
lm_prcp_month <- lm(PRCP_TOT ~ YEAR + MONTH, data = monthly_weather_100yr)
lm_prcp_avg <- lm(PRCP_AVG ~ YEAR + MONTH, data = monthly_weather_100yr)

lm_prcp_day <- lm(PRCP ~ YEAR, data = weather_100yr)
summary(lm_prcp_month)
## 
## Call:
## lm(formula = PRCP_TOT ~ YEAR + MONTH, data = monthly_weather_100yr)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -4.477 -1.469 -0.281  1.040 14.193 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -8.79929    3.09631   -2.84  0.00455 ** 
## YEAR         0.00625    0.00158    3.96  7.7e-05 ***
## MONTH02     -0.16779    0.27196   -0.62  0.53736    
## MONTH03      0.58582    0.27196    2.15  0.03140 *  
## MONTH04      0.42787    0.27196    1.57  0.11587    
## MONTH05      0.44213    0.27196    1.63  0.10422    
## MONTH06      0.34311    0.27196    1.26  0.20728    
## MONTH07      0.87082    0.27196    3.20  0.00139 ** 
## MONTH08      0.98926    0.27196    3.64  0.00028 ***
## MONTH09      0.48172    0.27196    1.77  0.07672 .  
## MONTH10      0.33508    0.27196    1.23  0.21811    
## MONTH11      0.07344    0.27196    0.27  0.78716    
## MONTH12      0.33918    0.27196    1.25  0.21253    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.12 on 1451 degrees of freedom
## Multiple R-squared:  0.0323, Adjusted R-squared:  0.0243 
## F-statistic: 4.04 on 12 and 1451 DF,  p-value: 3.29e-06
summary(lm_prcp_yr)
## 
## Call:
## lm(formula = PRCP_TOT ~ YEAR, data = yearly_weather_100yr)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -20.40  -5.59  -1.48   4.51  32.72 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -100.8708    42.4557   -2.38  0.01909 *  
## YEAR           0.0750     0.0217    3.46  0.00074 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.42 on 120 degrees of freedom
## Multiple R-squared:  0.0909, Adjusted R-squared:  0.0833 
## F-statistic:   12 on 1 and 120 DF,  p-value: 0.00074
summary(lm_prcp_avg)
## 
## Call:
## lm(formula = PRCP_AVG ~ YEAR + MONTH, data = monthly_weather_100yr)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.1445 -0.0483 -0.0093  0.0345  0.4577 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.89e-01   1.01e-01   -2.85  0.00445 ** 
## YEAR         2.04e-04   5.16e-05    3.96    8e-05 ***
## MONTH02      4.94e-03   8.91e-03    0.55  0.57946    
## MONTH03      1.89e-02   8.91e-03    2.12  0.03403 *  
## MONTH04      1.80e-02   8.91e-03    2.02  0.04376 *  
## MONTH05      1.43e-02   8.91e-03    1.60  0.10953    
## MONTH06      1.51e-02   8.91e-03    1.70  0.08917 .  
## MONTH07      2.81e-02   8.91e-03    3.15  0.00164 ** 
## MONTH08      3.19e-02   8.91e-03    3.58  0.00035 ***
## MONTH09      1.98e-02   8.91e-03    2.22  0.02660 *  
## MONTH10      1.08e-02   8.91e-03    1.21  0.22510    
## MONTH11      6.16e-03   8.91e-03    0.69  0.48926    
## MONTH12      1.09e-02   8.91e-03    1.23  0.21948    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0696 on 1451 degrees of freedom
## Multiple R-squared:  0.0264, Adjusted R-squared:  0.0184 
## F-statistic: 3.28 on 12 and 1451 DF,  p-value: 0.000104
coef_prcp_yr <- c(coefficients(lm_prcp_yr), summary(lm_prcp_yr)$r.square)
coef_prcp_yr
## (Intercept)        YEAR             
##   -100.8708      0.0750      0.0909
#coef_prcp_day <- coefficients(lm_tmax_day)
#coef_table_prcp <- cbind(coef_prcp_day, coef_prcp_yr)
#coef_table_prcp

The table above shows the slope and intercept of the TMAX linear model based on two different approaches.`

yr_prcp_scatter_line <- yearly_prcp_scatter+
  stat_smooth(method = "lm",
              formula = y ~ x,
              geom = "smooth",
              color = "black",
              show.legend = F) + 
  labs(title = "Total Yearly Precipitation")


yr_prcp_scatter_line

SNOW

lm_snow_yr <- lm(SNOW_TOT ~ YEAR, data = yearly_weather_100yr)
summary(lm_snow_yr)
## 
## Call:
## lm(formula = SNOW_TOT ~ YEAR, data = yearly_weather_100yr)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -22.36 -11.88  -1.58   7.99  33.79 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) -14.5785    70.5341   -0.21     0.84
## YEAR          0.0211     0.0360    0.59     0.56
## 
## Residual standard error: 14 on 120 degrees of freedom
## Multiple R-squared:  0.00286,    Adjusted R-squared:  -0.00545 
## F-statistic: 0.344 on 1 and 120 DF,  p-value: 0.559
coef_snow_yr <- coefficients(lm_snow_yr)
coef_snow_yr
## (Intercept)        YEAR 
##    -14.5785      0.0211

The table above shows the slope and intercept of the TMAX linear model based on two different approaches.coef_tmax_day uses the raw daily TMAX for each year in the dataset while coef_tmax_yr uses the yearly tmax averages.

yr_snow_scatter_line <- yearly_snow_scatter+
  stat_smooth(method = "lm",
              formula = y ~ x,
              geom = "smooth",
              color = "black",
              show.legend = F)+
  labs(title = "Total Yearly Snowfall") 


yr_snow_scatter_line

grid.arrange(yr_prcp_scatter_line, yr_snow_scatter_line,
             yr_tmax_scatter_line, yr_tmin_scatter_line,
             nrow = 2, 
             top = "Weather Patterns in Central Park, NYC, from 1900-2021")

Question 3

Do the observed changes in the weather data from New York City align with the documented rise in global air temperatures over the last century?

  • Compare rate of change observed in NYC to the global rate
  • What stat test?
  • Earth’s temperature has risen by 0.14° Fahrenheit (0.08° Celsius) per decade since 1880, but the rate of warming since 1981 is more than twice that: 0.32° F (0.18° C) per decade. (per climate.gov)

To compare the slope of TMAX from the NYC data set to the known global temp change, will look at confidence interval of the coefficients from the linear model.

weather1880 <- subset(monthly_weather, monthly_weather$YEAR > 1879 & monthly_weather$YEAR < 2022)
yearly_1880 <- subset(yearly_weather, yearly_weather$YEAR > 1879 & yearly_weather$YEAR < 2022)


yr1880_scatter <- ggplot(yearly_1880, aes(x = YEAR, y = TMAX)) + 
  geom_point() +
  labs(x = "Year", y = "Temperature (F)", title = "Yearly Average Maximum Temp")


yr1880_scatter

yr1880_line <- yr1880_scatter +
  stat_smooth(method = "lm",
              formula = y ~ x,
              geom = "smooth",
              col = "black") +
  labs(title = "NYC vs Global Temperature Change since 1880")
yr1880_line

yr1880_global <- yr1880_line + 
  geom_abline(aes(intercept = 33.2, slope = 0.014, col = "red")) +
  scale_color_discrete(name = "Locale", labels = c("Global"))


yr1880_global

lm1880 <- lm(TMAX ~ YEAR + MONTH, data = weather1880)
lm_yr1880 <- lm(TMAX ~ YEAR, data = yearly_1880)

summary(lm1880)
## 
## Call:
## lm(formula = TMAX ~ YEAR + MONTH, data = weather1880)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -11.470  -2.162   0.021   2.151  12.182 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -20.56997    3.82828   -5.37  8.8e-08 ***
## YEAR          0.03024    0.00196   15.45  < 2e-16 ***
## MONTH02       1.36947    0.39310    3.48  0.00051 ***
## MONTH03       9.67015    0.39310   24.60  < 2e-16 ***
## MONTH04      21.29680    0.39310   54.18  < 2e-16 ***
## MONTH05      32.36938    0.39310   82.34  < 2e-16 ***
## MONTH06      41.12661    0.39310  104.62  < 2e-16 ***
## MONTH07      46.02703    0.39310  117.09  < 2e-16 ***
## MONTH08      44.14562    0.39310  112.30  < 2e-16 ***
## MONTH09      37.49351    0.39310   95.38  < 2e-16 ***
## MONTH10      26.25897    0.39310   66.80  < 2e-16 ***
## MONTH11      14.47426    0.39310   36.82  < 2e-16 ***
## MONTH12       3.79827    0.39310    9.66  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.31 on 1691 degrees of freedom
## Multiple R-squared:  0.961,  Adjusted R-squared:  0.961 
## F-statistic: 3.51e+03 on 12 and 1691 DF,  p-value: <2e-16
summary(lm_yr1880)
## 
## Call:
## lm(formula = TMAX ~ YEAR, data = yearly_1880)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.168 -0.774  0.061  0.750  3.324 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.9014     5.0632    0.57     0.57    
## YEAR          0.0301     0.0026   11.61   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.27 on 140 degrees of freedom
## Multiple R-squared:  0.491,  Adjusted R-squared:  0.487 
## F-statistic:  135 on 1 and 140 DF,  p-value: <2e-16
weather1981 <- subset(monthly_weather, monthly_weather$YEAR > 1980 & monthly_weather$YEAR < 2022)
yearly_1981 <- subset(yearly_weather, yearly_weather$YEAR > 1980 & yearly_weather$YEAR < 2022)
all_1981 <- subset(weather_data, weather_data$YEAR > 1980 & weather_data$YEAR < 2022)

yr1981_scatter <- ggplot(yearly_1981, aes(x = YEAR, y = TMAX)) + 
  geom_point() +
  labs(x = "Year", y = "Temperature (F)", title = "Yearly Average Maximum Temp")
yr1981_scatter

yr1981_line <- yr1981_scatter +
  stat_smooth(method = "lm",
              formula = y ~ x,
              geom = "smooth",
              col = "black") +
  labs(title = "NYC vs Global Temperature Change since 1981")
yr1981_line

yr1981_global <- yr1981_line + 
  geom_abline(aes(intercept = -0.6, slope = 0.032, col = "red")) +
  scale_color_discrete(name = "Locale", labels = c("Global"))

yr1981_global

lm1981 <- lm(TMAX ~ YEAR + MONTH, data = weather1981)
lm_yearly1981 <- lm(TMAX ~ YEAR, data = yearly_1981)
lm_all1981 <- lm(TMAX ~ YEAR + MONTH, data = all_1981)
summary(lm1981)
## 
## Call:
## lm(formula = TMAX ~ YEAR + MONTH, data = weather1981)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -11.673  -1.923  -0.037   2.035  11.697 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  18.1657    24.4478    0.74     0.46    
## YEAR          0.0106     0.0122    0.86     0.39    
## MONTH02       3.2210     0.7081    4.55  6.8e-06 ***
## MONTH03      11.1857     0.7081   15.80  < 2e-16 ***
## MONTH04      22.6684     0.7081   32.01  < 2e-16 ***
## MONTH05      32.3674     0.7081   45.71  < 2e-16 ***
## MONTH06      40.7887     0.7081   57.61  < 2e-16 ***
## MONTH07      45.8442     0.7081   64.75  < 2e-16 ***
## MONTH08      44.1841     0.7081   62.40  < 2e-16 ***
## MONTH09      37.0709     0.7081   52.36  < 2e-16 ***
## MONTH10      25.6436     0.7081   36.22  < 2e-16 ***
## MONTH11      15.1497     0.7081   21.40  < 2e-16 ***
## MONTH12       5.1487     0.7081    7.27  1.5e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.21 on 479 degrees of freedom
## Multiple R-squared:  0.962,  Adjusted R-squared:  0.961 
## F-statistic: 1e+03 on 12 and 479 DF,  p-value: <2e-16
summary(lm_yearly1981) 
## 
## Call:
## lm(formula = TMAX ~ YEAR, data = yearly_1981)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -2.761 -0.615  0.189  0.885  2.746 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  41.5153    33.2338    1.25     0.22
## YEAR          0.0107     0.0166    0.65     0.52
## 
## Residual standard error: 1.26 on 39 degrees of freedom
## Multiple R-squared:  0.0106, Adjusted R-squared:  -0.0148 
## F-statistic: 0.418 on 1 and 39 DF,  p-value: 0.522
summary(lm_all1981)
## 
## Call:
## lm(formula = TMAX ~ YEAR + MONTH, data = all_1981)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -31.76  -5.88  -0.23   5.63  35.55 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 17.53858   12.17530    1.44    0.150    
## YEAR         0.01087    0.00608    1.79    0.074 .  
## MONTH02      3.23265    0.35782    9.03   <2e-16 ***
## MONTH03     11.18568    0.34940   32.01   <2e-16 ***
## MONTH04     22.66842    0.35230   64.34   <2e-16 ***
## MONTH05     32.36743    0.34940   92.64   <2e-16 ***
## MONTH06     40.78875    0.35230  115.78   <2e-16 ***
## MONTH07     45.84422    0.34940  131.21   <2e-16 ***
## MONTH08     44.18411    0.34940  126.46   <2e-16 ***
## MONTH09     37.07086    0.35230  105.22   <2e-16 ***
## MONTH10     25.64359    0.34940   73.39   <2e-16 ***
## MONTH11     15.14972    0.35230   43.00   <2e-16 ***
## MONTH12      5.14870    0.34940   14.74   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.81 on 14962 degrees of freedom
## Multiple R-squared:  0.764,  Adjusted R-squared:  0.764 
## F-statistic: 4.04e+03 on 12 and 14962 DF,  p-value: <2e-16

Question 2:

Are there any notable (and statistically significant) changes in weather patterns after/during the pandemic lockdown (perhaps due to changes in commuting patterns)?

  • Compare 10 years prior to 2020 to 2 years after?
  • Chi-square test or 2-sample t-test?
covid_weather <- subset(monthly_weather, YEAR > 2010 & YEAR < 2021 &
                          (MONTH == "04" |MONTH == "05" | MONTH == "06" |MONTH == "07" |
                              MONTH == "08"))

covid_weather <- covid_weather %>%
  add_column(Lockdown = 
               ifelse(.$YEAR < 2020, "Pre-Lockdown", "Post-Lockdown"),
             .after = "YEAR")

covid_weather <- covid_weather %>%
  add_column(Season = 
               ifelse(.$MONTH == "04" | .$MONTH == "05", "Spring", "Summer"),
             .after = "YEAR")


covid_seasons <- aggregate(TMAX ~ Season + Lockdown, covid_weather, mean)
covid_seasons$Lockdown <- factor(covid_seasons$Lockdown, levels = c('Pre-Lockdown', 'Post-Lockdown'))

covid_bar2 <- ggplot(covid_seasons, aes(x = Season, y = TMAX, fill = Lockdown, decreasing = TRUE)) +
  geom_col(position = "dodge") +
  labs(title = "Seasonal Average Daily Rainfall Pre and Post COVID-19 Lockdown")

covid_bar2

covid_spring_summer <- subset(weather_data,
                           YEAR > 2010 &
                           YEAR <= 2020 &
                             (MONTH == "04" |MONTH == "05" | MONTH == "06" |
                                MONTH == "07" | MONTH == "08"))
covid_spring_summer_ld <- covid_spring_summer %>%
  add_column(Lockdown = 
               ifelse(.$YEAR < 2020, "Pre-Lockdown", "Post-Lockdown"),
             .after = "YEAR")
covid_spring_summer_ld$Lockdown <- factor(covid_spring_summer_ld$Lockdown, levels = c('Pre-Lockdown', 'Post-Lockdown'))

covid_ld_season <- covid_spring_summer_ld %>%
  add_column(Season = 
               ifelse(.$MONTH == "04" | .$MONTH == "05", "Spring", "Summer"),
             .after = "YEAR")

covid_boxplot <- ggplot()+
  geom_boxplot(data = covid_ld_season, aes(y = TMAX, x = Season, fill = Lockdown)) +
  labs(title = "Temperature Ranges Pre- and Post- COVID Lockdown")

covid_boxplot

pre_spring <- subset(weather_data,
                           YEAR > 2010 & YEAR < 2020 &
                           (MONTH == "04" |MONTH == "05"))


pre_summer <- subset(weather_data,
                           YEAR > 2010 & YEAR < 2020 &
                           (MONTH == "06" |MONTH == "07" |
                            MONTH == "08"))


covid_lockdown_spring <- subset(weather_data,
                           YEAR == 2020 &
                           (MONTH == "04" |MONTH == "05"))


covid_lockdown_summer <- subset(weather_data,
                           YEAR == 2020 &
                           (MONTH == "06" |MONTH == "07" |
                            MONTH == "08"))

cov_spring_ttest <- t.test(pre_spring$TMAX, covid_lockdown_spring$TMAX)
cov_spring_ttest
## 
##  Welch Two Sample t-test
## 
## data:  pre_spring$TMAX and covid_lockdown_spring$TMAX
## t = 3, df = 78, p-value = 0.005
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  1.15 6.33
## sample estimates:
## mean of x mean of y 
##      67.3      63.5
cov_summer_ttest <- t.test(pre_summer$TMAX, covid_lockdown_summer$TMAX)
cov_summer_ttest
## 
##  Welch Two Sample t-test
## 
## data:  pre_summer$TMAX and covid_lockdown_summer$TMAX
## t = -2, df = 119, p-value = 0.05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.4951  0.0265
## sample estimates:
## mean of x mean of y 
##      83.1      84.4

Linear regression of TMAX ~ year * month

maxTfit2 <- lm(formula = TMAX ~ year*month, data = NYweath_sub )
summary(maxTfit2)
## 
## Call:
## lm(formula = TMAX ~ year * month, data = NYweath_sub)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -38.88  -5.86  -0.13   5.64  38.20 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    2.99525    5.67942    0.53  0.59793    
## year           0.01814    0.00292    6.22  5.1e-10 ***
## month02      -48.49021    8.22446   -5.90  3.7e-09 ***
## month03      -60.34742    7.99196   -7.55  4.4e-14 ***
## month04      -37.07059    8.05762   -4.60  4.2e-06 ***
## month05       19.85288    8.00082    2.48  0.01309 *  
## month06       47.84350    8.05762    5.94  2.9e-09 ***
## month07       39.66717    7.99196    4.96  6.9e-07 ***
## month08       22.68921    7.99196    2.84  0.00453 ** 
## month09       26.84293    8.06269    3.33  0.00087 ***
## month10       16.26197    8.02986    2.03  0.04285 *  
## month11      -31.06543    8.09646   -3.84  0.00012 ***
## month12      -36.39448    8.02986   -4.53  5.8e-06 ***
## year:month02   0.02563    0.00423    6.06  1.3e-09 ***
## year:month03   0.03592    0.00411    8.75  < 2e-16 ***
## year:month04   0.02993    0.00414    7.23  4.9e-13 ***
## year:month05   0.00641    0.00411    1.56  0.11870    
## year:month06  -0.00344    0.00414   -0.83  0.40539    
## year:month07   0.00328    0.00411    0.80  0.42388    
## year:month08   0.01103    0.00411    2.69  0.00725 ** 
## year:month09   0.00541    0.00414    1.31  0.19154    
## year:month10   0.00511    0.00413    1.24  0.21515    
## year:month11   0.02330    0.00416    5.60  2.1e-08 ***
## year:month12   0.02061    0.00413    4.99  5.9e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.87 on 56062 degrees of freedom
##   (7 observations deleted due to missingness)
## Multiple R-squared:  0.776,  Adjusted R-squared:  0.775 
## F-statistic: 8.42e+03 on 23 and 56062 DF,  p-value: <2e-16

The equations, including interaction terms, are:

January: -2.998 + 0.01814 year
February: -48.49 + -2.998 (0.02563 + 0.01814) year
March: -60.35 + -2.998 + (0.03592 + 0.01814) year
April: -37.07 + -2.998 + (0.02993 + 0.01814) year
May: 19.85 + -2.998 + (0.00641 + 0.01814) year
June: 47.84 + -2.998 + (0.02563 + 0.01814) year
July: 39.37 + -2.998 + (0.0328 + 0.01814) year
August: 22.69 + -2.998 + (0.01103 + 0.01814) year
September: 26.84 + -2.998 + (0.00541 + 0.01814) year
October: 16.26 + -2.998 + (0.00511 + 0.01814) year
November: -31.07 + -2.998 + (0.02330 + 0.01814) year
December: -36.39 + -2.998 + (0.02061 + 0.01814) year

Essentially the same r-squared here. The interaction terms are significant for February, March, April, August, November, and December, meaning that the change in temperature over time is likely different for these months. The warming is greatest in February, March, April, November, and December.

Now, let’s subset to 1900 to 2022.

Looking only at data from 1900 on

NYweath_00 <- subset (NYweath_sub, year > 1899) # subsetting to 20th century and later
xkabledplyhead(NYweath_00)
Head
DATE day month year TMAX TMIN TAVG PRCP SNOW
11265 1900-01-01 01 01 1900 20 14 NA 0.03 0.5
11266 1900-01-02 02 01 1900 23 13 NA 0.00 0.0
11267 1900-01-03 03 01 1900 26 19 NA 0.00 0.0
11268 1900-01-04 04 01 1900 32 19 NA 0.00 0.0
11269 1900-01-05 05 01 1900 39 29 NA 0.00 0.0

Linear regression of TMAX ~ year from 1900 on.

maxTfit00_0 <- lm(formula = TMAX ~ year, data = NYweath_00 )
summary(maxTfit00_0)
## 
## Call:
## lm(formula = TMAX ~ year, data = NYweath_00)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -58.95 -15.34   1.04  16.22  44.56 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.80092    4.86335    2.22    0.026 *  
## year         0.02616    0.00248   10.55   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.6 on 44827 degrees of freedom
## Multiple R-squared:  0.00248,    Adjusted R-squared:  0.00245 
## F-statistic:  111 on 1 and 44827 DF,  p-value: <2e-16

Linear regression of TMAX ~ year + month from 1900 on.

maxTfit00_1 <- lm(formula = TMAX ~ year + month, data = NYweath_00 )
summary(maxTfit00_1)
## 
## Call:
## lm(formula = TMAX ~ year + month, data = NYweath_00)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -39.37  -5.87  -0.17   5.62  37.48 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -11.05508    2.32357   -4.76  2.0e-06 ***
## year          0.02539    0.00118   21.47  < 2e-16 ***
## month02       1.41719    0.20808    6.81  9.8e-12 ***
## month03      10.18568    0.20318   50.13  < 2e-16 ***
## month04      21.47157    0.20487  104.81  < 2e-16 ***
## month05      32.31209    0.20318  159.03  < 2e-16 ***
## month06      40.86425    0.20487  199.47  < 2e-16 ***
## month07      46.03803    0.20318  226.59  < 2e-16 ***
## month08      44.18752    0.20318  217.48  < 2e-16 ***
## month09      37.47104    0.20492  182.86  < 2e-16 ***
## month10      26.49418    0.20360  130.13  < 2e-16 ***
## month11      14.62180    0.20529   71.22  < 2e-16 ***
## month12       3.74193    0.20360   18.38  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.87 on 44816 degrees of freedom
## Multiple R-squared:  0.773,  Adjusted R-squared:  0.773 
## F-statistic: 1.27e+04 on 12 and 44816 DF,  p-value: <2e-16

Linear regression of TMAX ~ year * month from 1900 on.

maxTfit00_2 <- lm(formula = TMAX ~ year * month, data = NYweath_00 )
summary(maxTfit00_2)
## 
## Call:
## lm(formula = TMAX ~ year * month, data = NYweath_00)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -38.62  -5.86  -0.18   5.60  37.68 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   23.97638    7.92089    3.03  0.00247 ** 
## year           0.00753    0.00404    1.86  0.06229 .  
## month02      -91.34608   11.47314   -7.96  1.7e-15 ***
## month03      -49.69861   11.20184   -4.44  9.2e-06 ***
## month04      -54.37296   11.29480   -4.81  1.5e-06 ***
## month05        6.04397   11.20184    0.54  0.58951    
## month06       22.67663   11.29480    2.01  0.04468 *  
## month07       31.67848   11.20184    2.83  0.00469 ** 
## month08        8.40307   11.20184    0.75  0.45317    
## month09       31.76679   11.30382    2.81  0.00495 ** 
## month10       40.24554   11.26959    3.57  0.00036 ***
## month11      -26.86867   11.36423   -2.36  0.01807 *  
## month12      -64.98949   11.26959   -5.77  8.1e-09 ***
## year:month02   0.04730    0.00585    8.09  6.3e-16 ***
## year:month03   0.03054    0.00571    5.35  9.0e-08 ***
## year:month04   0.03868    0.00576    6.72  1.9e-11 ***
## year:month05   0.01340    0.00571    2.35  0.01901 *  
## year:month06   0.00927    0.00576    1.61  0.10729    
## year:month07   0.00732    0.00571    1.28  0.19981    
## year:month08   0.01825    0.00571    3.20  0.00140 ** 
## year:month09   0.00291    0.00576    0.50  0.61383    
## year:month10  -0.00702    0.00575   -1.22  0.22195    
## year:month11   0.02116    0.00579    3.65  0.00026 ***
## year:month12   0.03505    0.00575    6.10  1.1e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.85 on 44805 degrees of freedom
## Multiple R-squared:  0.774,  Adjusted R-squared:  0.774 
## F-statistic: 6.68e+03 on 23 and 44805 DF,  p-value: <2e-16

Plotting daily max temp since 2000 with color as month

ggplot(NYweath_00, aes(x = DATE, y = TMAX, color = month)) + 
    geom_point(alpha = 0.6) +
    scale_color_manual(values = c("01" = "purple4",
                                  "02" = "purple",
                                  "03" = "dodgerblue",
                                  "04" = "cyan4",
                                  "05" = "yellow",
                                  "06" = "darkgoldenrod1",
                                  "07" = "red",
                                  "08" = "orange", 
                                  "09" = "yellow",
                                  "10" = "green",
                                  "11" = "turquoise",
                                  "12" = "steelblue")) +
      labs(
        x = "Year",
        y = "Maximum Daily Temperature",
        title = "Maximum Daily Temperature in Central Park") +
    xlab(label = "Year") +
    ylab(label = "Maximum daily temperature") +
    ggtitle(label = "Maximum Daily Temperature in Central Park") +
    stat_smooth(method = "lm",
            formula = y ~ x,
            geom = "smooth",
            color = "black",
            show.legend = F)

## Plotting the same data, but removing all but January and July

ggplot(NYweath_00, aes(x = year, y = TMAX, color = month)) +
    geom_point() +
    scale_color_manual(values = c("01" = "purple4",
                                   "07" = "red"), na.value = NA) +
    geom_abline(aes(intercept = -11.05508, slope = 0.02539), col = "black", size = 1) + 
    geom_abline(aes(intercept = 34.98295, slope = 0.02539), col = "black", size = 1) +
  
    labs(
        x = "Year",
        y = "Maximum Daily Temperature",
        title = "Maximum Daily Temperature in Central Park") +
    xlab(label = "Year") +
    ylab(label = "Maximum daily temperature") +
    ggtitle(label = "Maximum Daily Temperature in Central Park")

#    stat_smooth(method = "lm",
#            formula = y ~ x,
#            geom = "smooth",
#            color = "black",
#            show.legend = F)

Greyscale daily max temps from 1900-2022

ggplot(NYweath_00, aes(x = DATE, y = TMAX, color = month)) + 
    geom_point(alpha = 0.7) +
    scale_color_manual(values = c("01" = "gray35",
                                  "02" = "gray35",
                                  "03" = "gray35",
                                  "04" = "gray35",
                                  "05" = "gray35",
                                  "06" = "gray35",
                                  "07" = "gray35",
                                  "08" = "gray35", 
                                  "09" = "gray35",
                                  "10" = "gray35",
                                  "11" = "gray35",
                                  "12" = "gray35")) +
      labs(
        x = "Year",
        y = "Maximum Daily Temperature",
        title = "Maximum Daily Temperature in Central Park") +
    xlab(label = "Year") +
    ylab(label = "Maximum daily temperature") +
    ggtitle(label = "Maximum Daily Temperature in Central Park") +
    stat_smooth(method = "lm",
            formula = y ~ x,
            geom = "smooth",
            color = "black",
            show.legend = F)

Subsetting from 1948 on

NYweath_48 <- subset (NYweath_sub, year > 1947) # subsetting to 1948 and later for comparison with JFK data
xkabledplyhead(NYweath_48)
Head
DATE day month year TMAX TMIN TAVG PRCP SNOW
28796 1948-01-01 01 01 1948 32 29 NA 0.71 0.1
28797 1948-01-02 02 01 1948 34 30 NA 0.87 0.4
28798 1948-01-03 03 01 1948 34 26 NA 0.00 0.0
28799 1948-01-04 04 01 1948 35 25 NA 0.02 0.1
28800 1948-01-05 05 01 1948 37 32 NA 0.00 0.0

Linear regression of TMAX ~ year from 1948 on.

maxTfit48_0 <- lm(formula = TMAX ~ year, data = NYweath_48 )
summary(maxTfit48_0)
## 
## Call:
## lm(formula = TMAX ~ year, data = NYweath_48)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -53.73 -15.14   1.02  16.07  41.40 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 30.59725   10.28369    2.98   0.0029 **
## year         0.01619    0.00518    3.12   0.0018 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.5 on 27296 degrees of freedom
## Multiple R-squared:  0.000358,   Adjusted R-squared:  0.000321 
## F-statistic: 9.76 on 1 and 27296 DF,  p-value: 0.00178

Linear regression of TMAX ~ year + month from 1948 on.

maxTfit48_1 <- lm(formula = TMAX ~ year + month, data = NYweath_48 )
summary(maxTfit48_1)
## 
## Call:
## lm(formula = TMAX ~ year + month, data = NYweath_48)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -31.44  -5.89  -0.16   5.59  36.11 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.92079    4.92933    2.22    0.027 *  
## year         0.01406    0.00248    5.67  1.5e-08 ***
## month02      2.69999    0.26567   10.16  < 2e-16 ***
## month03     10.87570    0.25944   41.92  < 2e-16 ***
## month04     22.65256    0.26159   86.60  < 2e-16 ***
## month05     32.68774    0.25944  126.00  < 2e-16 ***
## month06     41.24278    0.26159  157.66  < 2e-16 ***
## month07     46.40688    0.25944  178.88  < 2e-16 ***
## month08     44.77290    0.25944  172.58  < 2e-16 ***
## month09     37.54036    0.26171  143.44  < 2e-16 ***
## month10     26.49689    0.26031  101.79  < 2e-16 ***
## month11     15.31521    0.26249   58.35  < 2e-16 ***
## month12      4.62418    0.26031   17.76  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.85 on 27285 degrees of freedom
## Multiple R-squared:  0.771,  Adjusted R-squared:  0.771 
## F-statistic: 7.64e+03 on 12 and 27285 DF,  p-value: <2e-16

Linear regression of TMAX ~ year * month from 1948 on.

maxTfit48_2 <- lm(formula = TMAX ~ year*month, data = NYweath_48 )
summary(maxTfit48_2)
## 
## Call:
## lm(formula = TMAX ~ year * month, data = NYweath_48)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -31.41  -5.90  -0.18   5.61  35.66 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    6.45328   16.80071    0.38  0.70090    
## year           0.01631    0.00846    1.93  0.05396 .  
## month02      -34.24256   24.32884   -1.41  0.15929    
## month03      -53.57555   23.75979   -2.25  0.02415 *  
## month04        4.05564   23.95697    0.17  0.86557    
## month05       43.95305   23.75979    1.85  0.06434 .  
## month06       88.80876   23.95697    3.71  0.00021 ***
## month07       87.58973   23.75979    3.69  0.00023 ***
## month08       64.73312   23.75979    2.72  0.00644 ** 
## month09       52.56168   23.98838    2.19  0.02845 *  
## month10      112.18612   23.99956    4.67    3e-06 ***
## month11       37.04712   24.20267    1.53  0.12585    
## month12      -65.33324   23.99956   -2.72  0.00649 ** 
## year:month02   0.01861    0.01226    1.52  0.12888    
## year:month03   0.03247    0.01197    2.71  0.00668 ** 
## year:month04   0.00937    0.01207    0.78  0.43757    
## year:month05  -0.00568    0.01197   -0.47  0.63539    
## year:month06  -0.02396    0.01207   -1.99  0.04709 *  
## year:month07  -0.02075    0.01197   -1.73  0.08304 .  
## year:month08  -0.01006    0.01197   -0.84  0.40084    
## year:month09  -0.00757    0.01208   -0.63  0.53117    
## year:month10  -0.04318    0.01209   -3.57  0.00036 ***
## year:month11  -0.01095    0.01219   -0.90  0.36918    
## year:month12   0.03525    0.01209    2.92  0.00355 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.83 on 27274 degrees of freedom
## Multiple R-squared:  0.771,  Adjusted R-squared:  0.771 
## F-statistic: 4e+03 on 23 and 27274 DF,  p-value: <2e-16

I wonder if we’ll get something different if we look at weather in a more built environment. Starting a new RMD file for this (JFK airport).

JFK Data!

JFKweath <- data.frame(read.csv("data/JFK_weather_1949-2022.csv"))
xkablesummary(JFKweath)
Table: Statistics summary.
STATION NAME LATITUDE LONGITUDE ELEVATION DATE ACMH ACMH_ATTRIBUTES ACSH ACSH_ATTRIBUTES AWND AWND_ATTRIBUTES DAPR DAPR_ATTRIBUTES EVAP EVAP_ATTRIBUTES FMTM FMTM_ATTRIBUTES MDPR MDPR_ATTRIBUTES PGTM PGTM_ATTRIBUTES PRCP PRCP_ATTRIBUTES SNOW SNOW_ATTRIBUTES SNWD SNWD_ATTRIBUTES TAVG TAVG_ATTRIBUTES TMAX TMAX_ATTRIBUTES TMIN TMIN_ATTRIBUTES TOBS TOBS_ATTRIBUTES TSUN TSUN_ATTRIBUTES WDF1 WDF1_ATTRIBUTES WDF2 WDF2_ATTRIBUTES WDF5 WDF5_ATTRIBUTES WDFG WDFG_ATTRIBUTES WDFM WDFM_ATTRIBUTES WDMV WDMV_ATTRIBUTES WESD WESD_ATTRIBUTES WSF1 WSF1_ATTRIBUTES WSF2 WSF2_ATTRIBUTES WSF5 WSF5_ATTRIBUTES WSFG WSFG_ATTRIBUTES WSFM WSFM_ATTRIBUTES WT01 WT01_ATTRIBUTES WT02 WT02_ATTRIBUTES WT03 WT03_ATTRIBUTES WT04 WT04_ATTRIBUTES WT05 WT05_ATTRIBUTES WT06 WT06_ATTRIBUTES WT07 WT07_ATTRIBUTES WT08 WT08_ATTRIBUTES WT09 WT09_ATTRIBUTES WT11 WT11_ATTRIBUTES WT13 WT13_ATTRIBUTES WT14 WT14_ATTRIBUTES WT15 WT15_ATTRIBUTES WT16 WT16_ATTRIBUTES WT17 WT17_ATTRIBUTES WT18 WT18_ATTRIBUTES WT21 WT21_ATTRIBUTES WT22 WT22_ATTRIBUTES WV01 WV01_ATTRIBUTES
Min Length:27104 Length:27104 Min. :40.6 Min. :-73.8 Min. :2.7 Length:27104 Min. : 0 Length:27104 Min. : 0 Length:27104 Min. : 0 Length:27104 Min. :16 Length:27104 Mode:logical Mode:logical Min. : 0 Length:27104 Min. : 0 Length:27104 Min. : 0 Length:27104 Min. :0 Length:27104 Min. : 0 Length:27104 Min. : 0 Length:27104 Min. : 8 Length:27104 Min. : 8.0 Length:27104 Min. :-2.0 Length:27104 Mode:logical Mode:logical Min. : 0 Length:27104 Min. : 10 Length:27104 Min. : 10 Length:27104 Min. : 0 Length:27104 Min. : 23 Length:27104 Min. :360 Length:27104 Mode:logical Mode:logical Min. :0 Length:27104 Min. : 6 Length:27104 Min. : 7 Length:27104 Min. : 0 Length:27104 Min. : 8 Length:27104 Min. : 9 Length:27104 Min. :1 Length:27104 Min. :1 Length:27104 Min. :1 Length:27104 Min. :1 Length:27104 Min. :1 Length:27104 Min. :1 Length:27104 Min. :1 Length:27104 Min. :1 Length:27104 Min. :1 Length:27104 Min. :1 Length:27104 Min. :1 Length:27104 Min. :1 Length:27104 Min. :1 Length:27104 Min. :1 Length:27104 Min. :1 Length:27104 Min. :1 Length:27104 Min. :1 Length:27104 Min. :1 Length:27104 Min. :1 Length:27104
Q1 Class :character Class :character 1st Qu.:40.6 1st Qu.:-73.8 1st Qu.:2.7 Class :character 1st Qu.: 30 Class :character 1st Qu.: 30 Class :character 1st Qu.: 8 Class :character 1st Qu.:27 Class :character NA’s:27104 NA’s:27104 1st Qu.: 1102 Class :character 1st Qu.: 2 Class :character 1st Qu.:1023 Class :character 1st Qu.:0 Class :character 1st Qu.: 0 Class :character 1st Qu.: 0 Class :character 1st Qu.:42 Class :character 1st Qu.: 48.0 Class :character 1st Qu.:34.0 Class :character NA’s:27104 NA’s:27104 1st Qu.: 0 Class :character 1st Qu.:160 Class :character 1st Qu.:160 Class :character 1st Qu.:160 Class :character 1st Qu.:180 Class :character 1st Qu.:360 Class :character NA’s:27104 NA’s:27104 1st Qu.:0 Class :character 1st Qu.:15 Class :character 1st Qu.: 17 Class :character 1st Qu.: 21 Class :character 1st Qu.:20 Class :character 1st Qu.:12 Class :character 1st Qu.:1 Class :character 1st Qu.:1 Class :character 1st Qu.:1 Class :character 1st Qu.:1 Class :character 1st Qu.:1 Class :character 1st Qu.:1 Class :character 1st Qu.:1 Class :character 1st Qu.:1 Class :character 1st Qu.:1 Class :character 1st Qu.:1 Class :character 1st Qu.:1 Class :character 1st Qu.:1 Class :character 1st Qu.:1 Class :character 1st Qu.:1 Class :character 1st Qu.:1 Class :character 1st Qu.:1 Class :character 1st Qu.:1 Class :character 1st Qu.:1 Class :character 1st Qu.:1 Class :character
Median Mode :character Mode :character Median :40.6 Median :-73.8 Median :2.7 Mode :character Median : 60 Mode :character Median : 70 Mode :character Median : 11 Mode :character Median :28 Mode :character NA NA Median : 1526 Mode :character Median : 3 Mode :character Median :1458 Mode :character Median :0 Mode :character Median : 0 Mode :character Median : 0 Mode :character Median :56 Mode :character Median : 62.0 Mode :character Median :47.0 Mode :character NA NA Median : 0 Mode :character Median :200 Mode :character Median :210 Mode :character Median :210 Mode :character Median :225 Mode :character Median :360 Mode :character NA NA Median :0 Mode :character Median :17 Mode :character Median : 21 Mode :character Median : 26 Mode :character Median :24 Mode :character Median :15 Mode :character Median :1 Mode :character Median :1 Mode :character Median :1 Mode :character Median :1 Mode :character Median :1 Mode :character Median :1 Mode :character Median :1 Mode :character Median :1 Mode :character Median :1 Mode :character Median :1 Mode :character Median :1 Mode :character Median :1 Mode :character Median :1 Mode :character Median :1 Mode :character Median :1 Mode :character Median :1 Mode :character Median :1 Mode :character Median :1 Mode :character Median :1 Mode :character
Mean NA NA Mean :40.6 Mean :-73.8 Mean :2.7 NA Mean : 58 NA Mean : 61 NA Mean : 11 NA Mean :28 NA NA NA Mean : 1439 NA Mean : 4 NA Mean :1384 NA Mean :0 NA Mean : 0 NA Mean : 0 NA Mean :56 NA Mean : 61.5 NA Mean :47.2 NA NA NA Mean : 160 NA Mean :212 NA Mean :215 NA Mean :215 NA Mean :231 NA Mean :360 NA NA NA Mean :0 NA Mean :19 NA Mean : 22 NA Mean : 27 NA Mean :26 NA Mean :15 NA Mean :1 NA Mean :1 NA Mean :1 NA Mean :1 NA Mean :1 NA Mean :1 NA Mean :1 NA Mean :1 NA Mean :1 NA Mean :1 NA Mean :1 NA Mean :1 NA Mean :1 NA Mean :1 NA Mean :1 NA Mean :1 NA Mean :1 NA Mean :1 NA Mean :1 NA
Q3 NA NA 3rd Qu.:40.6 3rd Qu.:-73.8 3rd Qu.:2.7 NA 3rd Qu.: 80 NA 3rd Qu.: 90 NA 3rd Qu.: 13 NA 3rd Qu.:30 NA NA NA 3rd Qu.: 1850 NA 3rd Qu.: 4 NA 3rd Qu.:1837 NA 3rd Qu.:0 NA 3rd Qu.: 0 NA 3rd Qu.: 0 NA 3rd Qu.:71 NA 3rd Qu.: 77.0 NA 3rd Qu.:62.0 NA NA NA 3rd Qu.: 0 NA 3rd Qu.:300 NA 3rd Qu.:310 NA 3rd Qu.:310 NA 3rd Qu.:315 NA 3rd Qu.:360 NA NA NA 3rd Qu.:0 NA 3rd Qu.:22 NA 3rd Qu.: 26 NA 3rd Qu.: 32 NA 3rd Qu.:30 NA 3rd Qu.:18 NA 3rd Qu.:1 NA 3rd Qu.:1 NA 3rd Qu.:1 NA 3rd Qu.:1 NA 3rd Qu.:1 NA 3rd Qu.:1 NA 3rd Qu.:1 NA 3rd Qu.:1 NA 3rd Qu.:1 NA 3rd Qu.:1 NA 3rd Qu.:1 NA 3rd Qu.:1 NA 3rd Qu.:1 NA 3rd Qu.:1 NA 3rd Qu.:1 NA 3rd Qu.:1 NA 3rd Qu.:1 NA 3rd Qu.:1 NA 3rd Qu.:1 NA
Max NA NA Max. :40.6 Max. :-73.8 Max. :2.7 NA Max. :100 NA Max. :100 NA Max. :308 NA Max. :31 NA NA NA Max. :32767 NA Max. :17 NA Max. :2359 NA Max. :8 NA Max. :30 NA Max. :28 NA Max. :91 NA Max. :104.0 NA Max. :82.0 NA NA NA Max. :2832 NA Max. :360 NA Max. :360 NA Max. :360 NA Max. :360 NA Max. :360 NA NA NA Max. :6 NA Max. :52 NA Max. :314 NA Max. :214 NA Max. :71 NA Max. :21 NA Max. :1 NA Max. :1 NA Max. :1 NA Max. :1 NA Max. :1 NA Max. :1 NA Max. :1 NA Max. :1 NA Max. :1 NA Max. :1 NA Max. :1 NA Max. :1 NA Max. :1 NA Max. :1 NA Max. :1 NA Max. :1 NA Max. :1 NA Max. :1 NA Max. :1 NA
NA NA NA NA NA NA NA NA’s :15663 NA NA’s :15662 NA NA’s :12953 NA NA’s :27008 NA NA NA NA’s :16980 NA NA’s :27008 NA NA’s :14837 NA NA’s :2890 NA NA’s :3172 NA NA’s :3172 NA NA’s :20961 NA NA’s :2 NA NA’s :2 NA NA NA NA’s :27073 NA NA’s :15668 NA NA’s :17455 NA NA’s :17538 NA NA’s :21983 NA NA’s :27103 NA NA NA NA’s :21033 NA NA’s :15665 NA NA’s :17455 NA NA’s :17539 NA NA’s :19641 NA NA’s :27102 NA NA’s :19021 NA NA’s :25713 NA NA’s :25673 NA NA’s :26713 NA NA’s :26730 NA NA’s :26896 NA NA’s :26992 NA NA’s :21556 NA NA’s :26924 NA NA’s :27090 NA NA’s :24919 NA NA’s :26217 NA NA’s :27062 NA NA’s :19315 NA NA’s :27044 NA NA’s :25677 NA NA’s :27099 NA NA’s :27055 NA NA’s :27103 NA
#converting to R date format and adding columns for day, month, and year
JFKweath$DATE <- as.Date(JFKweath$DATE)
JFKweath$day <- format(JFKweath$DATE, format="%d")
JFKweath$month <- format(JFKweath$DATE, format="%m")
JFKweath$year <- format(JFKweath$DATE, format="%Y")

#converting temperature observations to numerics
JFKweath$TMAX <- as.numeric(JFKweath$TMAX)
JFKweath$TMIN <- as.numeric(JFKweath$TMIN)
JFKweath$TAVG <- as.numeric(JFKweath$TAVG)
JFKweath$year <- as.numeric(JFKweath$year)

#Making month a factor
NYweath$month <- as.factor(NYweath$month)

#creating a subset of the weather data
JFKweath_sub <- subset(JFKweath, select = c(DATE, day, month, year, TMAX, TMIN, TAVG, PRCP, SNOW)) 
xkablesummary(JFKweath_sub)
Table: Statistics summary.
DATE day month year TMAX TMIN TAVG PRCP SNOW
Min Min. :1948-07-17 Length:27104 Length:27104 Min. :1948 Min. : 8.0 Min. :-2.0 Min. : 8 Min. :0 Min. : 0
Q1 1st Qu.:1967-02-03 Class :character Class :character 1st Qu.:1967 1st Qu.: 48.0 1st Qu.:34.0 1st Qu.:42 1st Qu.:0 1st Qu.: 0
Median Median :1985-08-23 Mode :character Mode :character Median :1985 Median : 62.0 Median :47.0 Median :56 Median :0 Median : 0
Mean Mean :1985-08-23 NA NA Mean :1985 Mean : 61.5 Mean :47.2 Mean :56 Mean :0 Mean : 0
Q3 3rd Qu.:2004-03-12 NA NA 3rd Qu.:2004 3rd Qu.: 77.0 3rd Qu.:62.0 3rd Qu.:71 3rd Qu.:0 3rd Qu.: 0
Max Max. :2022-09-30 NA NA Max. :2022 Max. :104.0 Max. :82.0 Max. :91 Max. :8 Max. :30
NA NA NA NA NA NA’s :2 NA’s :2 NA’s :20961 NA’s :2890 NA’s :3172

Multiple linear regression for JFK from 1948 on

jmaxTfit1 <- lm(formula = TMAX ~ year + month, data = JFKweath_sub )
summary(jmaxTfit1)
## 
## Call:
## lm(formula = TMAX ~ year + month, data = JFKweath_sub)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -32.57  -5.17  -0.15   5.04  36.63 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -28.21756    4.45988   -6.33  2.5e-10 ***
## year          0.03380    0.00224   15.06  < 2e-16 ***
## month02       2.02149    0.23936    8.45  < 2e-16 ***
## month03       9.32084    0.23372   39.88  < 2e-16 ***
## month04      19.76311    0.23566   83.86  < 2e-16 ***
## month05      29.41606    0.23375  125.84  < 2e-16 ***
## month06      38.83293    0.23566  164.78  < 2e-16 ***
## month07      44.45024    0.23334  190.49  < 2e-16 ***
## month08      43.24112    0.23295  185.63  < 2e-16 ***
## month09      36.60503    0.23487  155.85  < 2e-16 ***
## month10      26.04513    0.23373  111.43  < 2e-16 ***
## month11      15.05592    0.23567   63.88  < 2e-16 ***
## month12       4.75950    0.23376   20.36  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.92 on 27089 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.795,  Adjusted R-squared:  0.795 
## F-statistic: 8.78e+03 on 12 and 27089 DF,  p-value: <2e-16